Abstract

When using datasets gathered from mediums designed for casual conversation, such as Twitter, a major problem is the large amount of sarcasm they contain. Because sarcasm is difficult to detect for both humans and Natural Language Processing (NLP) models, it can hinder a model’s accuracy. As a result, we hoped to create an NLP model specifically designed to detect sarcasm on behalf of other NLP models. To do this, two datasets will be utilized: ISarcasm and Tweets with Sarcasm and Irony. These datasets will be trained on with five different models: Recurrent Neural Networks (RNN), Support Vector Machines (SVM), Random Forests, Decision Trees, and XGBoost. By combining our model with other NLP models, we hope to increase the accuracy of those models.

Dataset

Visualizations

Retrieve Data

  • The Dataset already has the train and test separated so no need to do it manually
# Location of the "Tweets with Sarcasm and Irony" dataset.
# It ships already split into train.csv and test.csv.
Dir <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
# Expected columns: tweets (chr), class (chr)
train <- read_csv(paste0(Dir,"/train.csv"))
## Rows: 81408 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv(paste0(Dir,"/test.csv"))
## Rows: 8128 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Exploratory cleanup steps kept for reference (currently disabled):
#test <- test %>% filter(!is.na(class))
#train$tweets <- map(train$tweets, .f = function(x){
#  str_squish(x)
#}) %>% unlist()

Some Exploration

  • Shows how many observations exist in each dataset
# Distinct class labels and the observation count of each split
classes <- unique(train$class)
num_obs_train <- nrow(train)
num_obs_test <- nrow(test)

Classes: figurative, irony, regular, sarcasm
Number of observations in Train: 81408
Number of observations in Test: 8128

  • Demonstrates how many observations exist for each class
# Observation count per class
t <- train %>% group_by(class) %>% count()
t
  • View the missing data, we consider empty strings as missing data as well
# Rows whose tweet text is empty, whitespace-only, or NA
train %>% filter(tweets == "" | tweets == " " | is.na(tweets))
  • Visualize tweet length range
# Approximate tweet length in words: number of spaces + 1.
# stringr::str_count() is vectorised, so the original per-element
# map()/unlist() pass is unnecessary.
tweet_lengths <- str_count(train$tweets, pattern = " ") + 1

# seq_len() is safe even when the frame is empty (1:nrow would give c(1, 0))
ids <- seq_len(nrow(train))
train_temp <- train %>% mutate(tweet_length = tweet_lengths,
                               id = ids)

# Distribution of tweet lengths, bars coloured by their height
train_temp %>% ggplot(aes(x = tweet_lengths)) + 
  geom_bar(aes(fill = after_stat(count)))

  • Based on the graphic, tweet lengths appear approximately normally distributed
  • Tweet length range visualized with box plot
# Overall spread of tweet lengths
train_temp %>% ggplot(aes(y = tweet_lengths)) + 
  geom_boxplot()

  • Tweet length range visualized with a box plot for each class

# Tweet-length spread broken out by class
train_temp %>% ggplot(aes(x = class, y = tweet_lengths)) + 
  geom_boxplot()

Max tweet length: 67
Min tweet length: 1
Mean tweet length: 15.1798595

Max Tweet Length

# Row(s) containing the longest tweet
t <- train_temp %>% filter(tweet_lengths == max(tweet_lengths))
t

Min Tweet Length

# Row(s) containing the shortest tweet
t <- train_temp %>% filter(tweet_lengths == min(tweet_lengths))
t

Visualization/transformation Functions

  • Some functions for transformation, most notable, get_hashtags_df mutates hashtags from tweets into a new column.
  • Takes in a list of strings
#------------------------------------------------------------------------------
# Progress reporter so I don't lose my mind waiting for a long loop to finish.
# i:      the current iteration
# length: total number of iterations in the loop
# p:      last percentage printed; initialise p = 0 outside the loop so the
#         same milestone is not announced twice
# Returns the updated "last printed" percentage.
print_percent <- function(i, length, p) {
  pct <- floor(i / length * 100)
  # Announce only multiples of 10%, and each one only once
  if (pct %% 10 == 0 && pct != p) {
    print(paste0(pct, "% Complete"))
    p <- pct
  }
  p
}
#------------------------------------------------------------------------------
# Separates hashtags from text.
# text: character vector of tweets.
# Returns a list with one element per tweet; each element is a list of every
# whitespace-separated token containing "#" (matching the original "#.*"
# pattern, which accepts a "#" anywhere in the token).
#
# Rewritten as a vectorised base-R pipeline: the original grew the result with
# append() inside a double loop (O(n^2)), crashed on empty token vectors
# (1:length(x) when length is 0), and printed progress — none of which is
# needed once the work is vectorised.
get_hashtags_df <- function(text) {
  tokens_per_tweet <- strsplit(text, " ", fixed = TRUE)
  lapply(tokens_per_tweet, function(tokens) {
    # keep the full token, as the original did (e.g. "abc#def" is kept whole)
    as.list(tokens[grepl("#", tokens, fixed = TRUE)])
  })
}

Read in Data

# Re-read the raw dataset (duplicate of the earlier "Retrieve Data" chunk so
# this section can be run independently)
Dir <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
train <- read_csv(paste0(Dir,"/train.csv"))
## Rows: 81408 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv(paste0(Dir,"/test.csv"))
## Rows: 8128 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#test <- test %>% filter(!is.na(class))

Separate Hashtags from text into a new column

# Extract hashtags from the raw training tweets into their own column,
# strip them from the text, and cache the result to disk.
Dir <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
train <- read_csv(paste0(Dir,"/train.csv"))
#tweets <- train$tweets
y <- get_hashtags_df(train$tweets)
# NOTE(review): sub() replaces only the FIRST match, and "#.* " is greedy —
# confirm this removes exactly the intended hashtag span from each tweet
train$tweets <- train$tweets %>% sub(pattern = "#.* | #.*$", replacement = "")
train <- train %>% mutate(hashtags = y)
#Note to load from disk use load("Datasets/train_w_hashtags.RData") in the markdown file
save(train, file="Datasets/train_w_hashtags.RData")



test <- read_csv(paste0(Dir,"/test.csv"))
# Bug fix: this block previously re-ran get_hashtags_df on `train` and mutated
# `train` again, so the saved test set never had its hashtags extracted or
# stripped. Every step below now operates on `test`.
y <- get_hashtags_df(test$tweets)
test$tweets <- test$tweets %>% sub(pattern = "#.* | #.*$", replacement = "")
test <- test %>% mutate(hashtags = y)
#Note to load from disk use load("Datasets/test_w_hashtags.RData") in the markdown file
save(test, file="Datasets/test_w_hashtags.RData")
  • Preprocessing function for cleaning tweet column
# Cleans the tweet column: strips @mentions, lowercases, removes punctuation
# and English stopwords, then drops rows whose text ends up empty.
# data: data.frame with a `tweets` character column.
# Returns the cleaned data.frame (possibly with fewer rows).
preprocessing <- function(data) {
    require('tm')
    require('stopwords')
    
    # NOTE(review): sub() replaces only the first match and "@.* " is greedy —
    # this can delete everything from the first "@" to the last space; confirm
    data$tweets <- data$tweets %>% sub(pattern = "@.* | @.*$", replacement = "")
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    # Cleaning can empty a tweet entirely; drop those rows
    data <- data  %>% filter(tweets != "")
    data
}


# Reload the cached hashtag-augmented training set and clean its text
load('Datasets/Tweets_with_Sarcasm_and_Irony/train_w_hashtags.Rdata')
train <- preprocessing(train)
train
# Peek at the extracted hashtag lists
train$hashtags %>% head()
## [[1]]
## [[1]][[1]]
## [1] "#staylight"
## 
## [[1]][[2]]
## [1] "#staywhite"
## 
## [[1]][[3]]
## [1] "#sarcastic"
## 
## [[1]][[4]]
## [1] "#moralneeded"
## 
## 
## [[2]]
## [[2]][[1]]
## [1] "#sarcasm"
## 
## [[2]][[2]]
## [1] "#people"
## 
## [[2]][[3]]
## [1] "#diy"
## 
## [[2]][[4]]
## [1] "#artattack"
## 
## 
## [[3]]
## [[3]][[1]]
## [1] "#DailyMail"
## 
## [[3]][[2]]
## [1] "#shocker"
## 
## [[3]][[3]]
## [1] "#sarcastic"
## 
## [[3]][[4]]
## [1] "#dailyfail"
## 
## [[3]][[5]]
## [1] "#inHuntspocket"
## 
## [[3]][[6]]
## [1] "#theyhatethenhs"
## 
## 
## [[4]]
## [[4]][[1]]
## [1] "#sarcasm"
## 
## 
## [[5]]
## [[5]][[1]]
## [1] "#sarcastic"
## 
## 
## [[6]]
## [[6]][[1]]
## [1] "#Irony"
## 
## [[6]][[2]]
## [1] "#TimesChange"
# Re-read the raw data and split the training set by class so word
# frequencies can be computed per class
train <- read.csv('Datasets/Tweets_with_Sarcasm_and_Irony/train.csv')
test <- read.csv('Datasets/Tweets_with_Sarcasm_and_Irony/test.csv')
figurativeSet <- filter(train, class=="figurative")

ironySet <- filter(train, class == "irony")

sarcasmSet <- filter(train, class =="sarcasm")

regularSet <- filter(train, class =="regular")


# Everything that is not "regular" (figurative + irony + sarcasm) pooled together
not_regularSet <- filter(train, class != "regular")
  • Filter tweets with their classes
# Word-frequency table for one class: split tweets on spaces, tabulate every
# token, and sort descending. Factored out of four copy-pasted pipelines.
# tweets: character vector. Returns a data.frame (Var1 = word chr, Freq = int).
word_freq <- function(tweets) {
  as.data.frame(sort(table(unlist(strsplit(tweets, " "))), decreasing = TRUE),
                stringsAsFactors = FALSE)
}

freq_figurative <- word_freq(figurativeSet$tweets)
summary(freq_figurative)
##      Var1                Freq         
##  Length:73284       Min.   :   1.000  
##  Class :character   1st Qu.:   1.000  
##  Mode  :character   Median :   1.000  
##                     Mean   :   4.469  
##                     3rd Qu.:   1.000  
##                     Max.   :8721.000
freq_irony <- word_freq(ironySet$tweets)

freq_sarcasm <- word_freq(sarcasmSet$tweets)

freq_regular <- word_freq(regularSet$tweets)

freq_not_regular <- word_freq(not_regularSet$tweets)
  • Collect the words and their frequencies from the set

Outliers

Figurative Class Outliers

 # Words appearing more than 851 times in figurative tweets, plotted by frequency
 # NOTE(review): stringsAsFactors is not a subset() argument and is silently ignored
 outlierSubsetFigurative <- subset(freq_figurative, Freq > 851, stringsAsFactors = FALSE)
  outlierSubsetFigurative %>% 
    ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
        geom_bar(stat = 'identity')  +
          theme(axis.text.x = element_text(angle = 60, hjust = 1))

Irony Class Outliers

  # Words appearing more than 851 times in irony tweets, plotted by frequency
  outlierSubsetIrony <- subset(freq_irony, Freq > 851, stringsAsFactors = FALSE)
  outlierSubsetIrony %>% 
    ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
        geom_bar(stat = 'identity')  +
          theme(axis.text.x = element_text(angle = 60, hjust = 1))

Sarcasm Class Outliers

  # Words appearing more than 851 times in sarcasm tweets, plotted by frequency
  outlierSubsetSarcasm <- subset(freq_sarcasm, Freq > 851, stringsAsFactors = FALSE)
  outlierSubsetSarcasm %>% 
    ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
        geom_bar(stat = 'identity')  +
          theme(axis.text.x = element_text(angle = 60, hjust = 1))

Regular Class Outliers

  # Words appearing more than 851 times in regular tweets, plotted by frequency
  outlierSubsetRegular <- subset(freq_regular, Freq > 851, stringsAsFactors = FALSE)
  outlierSubsetRegular %>% 
    ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
        geom_bar(stat = 'identity')  +
          theme(axis.text.x = element_text(angle = 60, hjust = 1))

# Combine the per-class word-frequency tables into one wide table
# (full_join keeps words missing from some classes as NA)
frequencys <- full_join(freq_figurative,freq_irony, by = "Var1") %>%
  full_join(freq_regular, by = "Var1") %>%
  full_join(freq_sarcasm, by = "Var1") %>%
  rename(figurative = Freq.x,
         irony = Freq.y,
         regular = Freq.x.x,
         sarcasm = Freq.y.y)


# Two-class version: regular vs everything else
frequencys_2_class <-  full_join(freq_regular,freq_not_regular, by = "Var1") %>%
   rename(regular = Freq.x,
         not_regular = Freq.y)


# Replace zero/missing counts with 1 so the ratio scores below never divide
# by zero and absent words contribute a neutral count
frequencys_2_class[frequencys_2_class == 0] <- 1
frequencys_2_class[is.na(frequencys_2_class)] <- 1


frequencys[frequencys == 0] <- 1
frequencys[is.na(frequencys)] <- 1

# Heuristic "distinctiveness" score per word: its count in one class divided
# by the PRODUCT of its counts in the other three classes.
# NOTE(review): dividing by the product (rather than the sum) heavily favours
# words absent from the other classes — confirm this is the intended weighting
frequencys <- frequencys %>%
  mutate(figurative_prop = figurative/(irony * regular * sarcasm)) %>%
  mutate(irony_prop = irony/(figurative * regular * sarcasm)) %>%
  mutate(sarcasm_prop = sarcasm/(figurative * regular * irony)) %>%
  mutate(regular_prop = regular/(figurative * sarcasm * irony)) 


# Two-class ratio (regular-leaning) and its inverse (non-regular-leaning)
frequencys_2_class <- frequencys_2_class %>% 
  mutate(prop = regular/not_regular) %>%
  mutate(inv_prop = not_regular/regular)



# Number of top words to display in each chart
max = 60
#graph_freq <- function(df, max_entries = 60) {
  # Top `max` words most distinctive of the regular class
  frequencys %>% 
  arrange(desc(regular_prop)) %>%
  slice(1:max) %>%
  ggplot(aes(y = regular_prop, x = reorder(Var1, order(regular_prop, decreasing = TRUE)))) +
  geom_bar(stat='identity') +
  ggtitle("regular Outliers") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

#}

  # NOTE(review): `prop` = regular/not_regular, i.e. regular-leaning words —
  # but the title says "Not regular". The word-cloud section below treats
  # `prop` as the regular side; confirm whether this chart should use
  # `inv_prop` instead or be retitled.
  frequencys_2_class %>% 
  arrange(desc(prop)) %>%
  slice(1:max) %>%
  ggplot(aes(y = prop, x = reorder(Var1, order(prop, decreasing = TRUE)))) +
  geom_bar(stat='identity') +
  ggtitle("Not regular Outliers") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

ggwordcloud Visualization

# Number of words to show in each cloud
max = 60
# Top words leaning towards the regular class (highest regular/not_regular)
frequencys_tmp_prop <- frequencys_2_class %>%
  arrange(desc(prop)) %>%
  slice(1:max)

# Top words leaning towards the non-regular classes
frequencys_tmp_inv_prop <- frequencys_2_class %>%
  arrange(desc(inv_prop)) %>%
  slice(1:max)


# Word cloud: words most associated with "regular" tweets (size = score)
ggplot(frequencys_tmp_prop, aes(label = Var1, size = prop)) +
  geom_text_wordcloud_area(eccentricity = .54, color = 'red') +
  #scale_size_area(max_size = 30) +
  theme_minimal() +  
  ggtitle ('Words found in tweets that are considered regular')

# Word cloud: words most associated with the non-regular classes
ggplot(frequencys_tmp_inv_prop, aes(label = Var1, size = inv_prop)) +
  geom_text_wordcloud_area(eccentricity = .54, color = 'blue') +
  #scale_size_area(max_size = 30) +
  theme_minimal() +  
  ggtitle ('Words found in tweets that are not considered regular (irony, figurative, sarcasm)')
## Warning in png(filename = tmp_file, width = gw_pix, height = gh_pix, res =
## dev_dpi, : 'width=12, height=12' are unlikely values in pixels
## Warning in png(filename = tmp_file, width = gw_pix, height = gh_pix, res =
## dev_dpi, : 'width=12, height=16' are unlikely values in pixels

  • These two graphs demonstrate common words found in each group of classes. The first graph shows the regular class and the second shows the irony, figurative, and sarcasm classes.

Models

Results

RNN

# Cleans the tweet column (duplicate of the earlier preprocessing helper, but
# WITHOUT dropping rows that end up empty — the row count must stay aligned
# with the label vector built right after this).
preprocessing <- function(data) {
    require('tm')
    
    data$tweets <- data$tweets %>% sub(pattern = "@.* | @.*$", replacement = "")
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    # NOTE(review): stopwords() here resolves from tm (package 'stopwords' is
    # not required in this copy) — confirm the same stopword list is intended
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    #data$tweets <- data$tweets[data$tweets != ""]
    data
  }
# Dataset directory paths — defined BEFORE first use (previously these were
# assigned after `Dir = Dir_ISarcasm`, which referenced Dir_ISarcasm before
# it existed)
Dir_Main <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
Dir_ISarcasm <- 'Datasets/ISarcasm'

Dir <- Dir_ISarcasm
train <- read.csv(paste0(Dir,"/train.csv"), fileEncoding = 'utf-8')
train <- preprocessing(train)
# NOTE(review): `class` still holds string labels at this point, so coercing
# to double yields all NAs (hence the warning below). If these labels are
# consumed downstream, map them to numeric codes first (see factor_set).
training_labels <- (train$class %>% as.array() %>% as.double()) 
## Warning in train$class %>% as.array() %>% as.double(): NAs introduced by
## coercion


# List the devices TensorFlow can see (confirms the GPU is visible)
tensorflow::tf$python$client$device_lib$list_local_devices() %>% print()
## [[1]]
## name: "/device:CPU:0"
## device_type: "CPU"
## memory_limit: 268435456
## locality {
## }
## incarnation: 7655193342591569220
## xla_global_id: -1
## 
## 
## [[2]]
## name: "/device:GPU:0"
## device_type: "GPU"
## memory_limit: 5719982080
## locality {
##   bus_id: 1
##   links {
##   }
## }
## incarnation: 2759015377138024622
## physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6"
## xla_global_id: 416903419
#-------------------------------------------------------------------
# Balances a binary-labelled data.frame by downsampling the larger class so
# class 0 ("regular") and class 1 ("sarcasm"/not-regular) have equal counts,
# then shuffles the rows.
# data: data.frame with a numeric 0/1 `class` column.
# Returns the balanced, shuffled data.frame.
even_out_observations <- function(data){
  regular <- data[which(data$class == 0), ]
  sarcasm <- data[which(data$class == 1), ]
  # Bug fix: the original always took the first nrow(regular) sarcasm rows,
  # which NA-pads the result whenever sarcasm has FEWER rows than regular.
  # Truncate both sides to the smaller class size instead.
  n <- min(nrow(regular), nrow(sarcasm))
  regular <- regular[seq_len(n), ]
  sarcasm <- sarcasm[seq_len(n), ]
  data <- rbind(regular, sarcasm)
  # Shuffle so the two classes are interleaved
  data <- data[sample(seq_len(nrow(data))), ]
  data
}

#-------------------------------------------------------------------
# Loads the ISarcasm train/test CSVs, cleans the tweet text, maps class
# labels to numeric codes, and carves an 80/20 validation split off the
# training set.
# Dir:    dataset directory containing train.csv and test.csv
# binary: when TRUE, figurative/irony/sarcasm all collapse into class 1
# Returns a list with the three splits and their numeric label vectors.
retrieve_dataset_ISarcasm <- function(Dir = Dir_ISarcasm, binary = FALSE) {
  train <- read.csv(paste0(Dir,"/train.csv"), fileEncoding = 'utf-8')# %>% rename(tweets = tweet, class = sarcastic)
  test <- read.csv(paste0(Dir,"/test.csv"),fileEncoding = 'utf-8') #%>% rename(tweets = tweet, class = sarcastic)
  # Strip @mentions, lowercase, drop punctuation and English stopwords
  # (rows are NOT dropped, keeping text aligned with the labels)
  preprocessing <- function(data) {
    require('tm')
    
    data$tweets <- data$tweets %>% sub(pattern = "@.* | @.*$", replacement = "")
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    #data$tweets <- data$tweets[data$tweets != ""]
    data
  }
  
  
  train <- preprocessing(train)
  test <- preprocessing(test)

  # Map string labels to numeric codes; in binary mode every non-regular
  # class becomes 1
  factor_set <- function(set) {
    set$class[set$class == 'regular'] = 0
    set$class[set$class == 'sarcasm'] = 1
    
    if(!binary) {
      set$class[set$class == 'figurative'] = 2
      set$class[set$class == 'irony'] = 3
    } else {
      set$class[set$class == 'figurative'] = 1
      set$class[set$class == 'irony'] = 1
    }
    set
  }

  train <- factor_set(train)
  test <- factor_set(test)

  index <- createDataPartition(train$class, p = .8, list = FALSE)
  
  # Bug fix: the validation set must be carved from the ORIGINAL train before
  # it is overwritten — previously `train <- train[index,]` ran first, so
  # `train[-index,]` sliced the already-subset train with stale indices.
  validation <- train[-index,]
  train <- train[index,]
  
  
  # Coerce the numeric-code labels to plain double vectors for keras
  training_labels <- (train$class %>% as.array() %>% as.double())
  validation_labels <- (validation$class %>% as.array() %>% as.double())
  test_labels <-  (test$class %>% as.array() %>% as.double()) 
  
  
  list(train_set = train,
       train_labels = training_labels,
       test_set = test,
       test_labels = test_labels,
       validation_set = validation,
       validation_labels = validation_labels)
}

#-------------------------------------------------------------------
#load('Datasets/Tweets_with_Sarcasm_and_Irony/test_w_hashtags.RData')
# Loads the Tweets with Sarcasm and Irony train/test CSVs, cleans the tweet
# text, maps class labels to numeric codes, and carves an 80/20 validation
# split off the training set.
# Dir:              dataset directory
# binary:           when TRUE, figurative/irony/sarcasm all collapse into 1
# even_out:         when TRUE (with binary), downsample to balanced classes
# without_hashtags: when TRUE, read the hashtag-stripped CSV variants
# Returns a list with the three splits and their numeric label vectors.
retrieve_dataset <- function(Dir = 'Datasets/Tweets_with_Sarcasm_and_Irony', binary = FALSE, even_out = FALSE, without_hashtags = FALSE) {
  
  
  if(!without_hashtags){
    train <- read_csv(paste0(Dir,"/train.csv"))
    test <- read_csv(paste0(Dir,"/test.csv"))
  } else {
    train <- read_csv(paste0(Dir,"/train_without_hashtags.csv"))
    test <- read_csv(paste0(Dir,"/test_without_hashtags.csv"))
  }
  
  #load('Datasets/Tweets_with_Sarcasm_and_Irony/test_w_hashtags.RData')
  #load('Datasets/Tweets_with_Sarcasm_and_Irony/train_w_hashtags.RData')
 # train <- read_csv(paste0(Dir,"/train_without_hashtags.csv"))
  #test <- read_csv(paste0(Dir,"/test_without_hashtags.csv"))
  
  
  # Lowercase, drop punctuation and English stopwords (no row dropping,
  # keeping text aligned with the labels)
  preprocessing <- function(data) {
    require('tm')
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    data
  }
  
  train <- preprocessing(train)
  test <- preprocessing(test)
  
  
  
  test <- test %>% filter(!is.na(class))
  
  # Map string labels to numeric codes; in binary mode every non-regular
  # class becomes 1
  factor_set <- function(set) {
    set$class[set$class == 'regular'] = 0
    set$class[set$class == 'sarcasm'] = 1
    
    if(!binary) {
      set$class[set$class == 'figurative'] = 2
      set$class[set$class == 'irony'] = 3
    } else {
      set$class[set$class == 'figurative'] = 1
      set$class[set$class == 'irony'] = 1
    }
    set
  }
  train <- factor_set(train)
  test <- factor_set(test)
  
  index <- createDataPartition(train$class, p = .8, list = FALSE)
  # Bug fix: the validation set must be carved from the ORIGINAL train before
  # it is overwritten — previously `train <- train[index,]` ran first, so
  # `train[-index,]` sliced the already-subset train with stale indices.
  validation <- train[-index,]
  train <- train[index,]
  
  if(even_out && binary){
    train <- even_out_observations(train)
    test <- even_out_observations(test)
    validation <- even_out_observations(validation)
  }
   
  # Coerce the numeric-code labels to plain double vectors for keras
  training_labels <- (train$class %>% as.array() %>% as.double())
  validation_labels <- (validation$class %>% as.array() %>% as.double())
  test_labels <-  (test$class %>% as.array() %>% as.double()) 
  
  
  
  list(train_set = train,
       train_labels = training_labels,
       test_set = test,
       test_labels = test_labels,
       validation_set = validation,
       validation_labels = validation_labels)
}
#--------------------------------------------------------------------------------
# Turns the three text splits into fixed-length integer sequences for keras.
#
# train_data / validation_data / testing_data: data.frames with a `tweets` column
# maxlen:    length every padded sequence is forced to
# max_words: vocabulary size kept when a tokenizer must be fitted
# tokenizer: optional pre-fitted tokenizer; when NULL a new one is fitted on
#            the training text only
# Returns the three padded sequence matrices plus the tokenizer used.
generate_sequences <- function(train_data,
                               validation_data,
                               testing_data,
                               maxlen = 50,
                               max_words = 2000,
                               tokenizer = NULL)
{
  # Raw tweet text of each split
  train_text <- as.array(train_data$tweets)
  valid_text <- as.array(validation_data$tweets)
  test_text <- as.array(testing_data$tweets)

  if (is.null(tokenizer)) {
    # No tokenizer supplied — fit a fresh one on the training text
    tokenizer <- text_tokenizer(num_words = max_words) %>%
      fit_text_tokenizer(train_text)
    print('creating Tokenizer.....')
  } else {
    print('found tokenizer!')
  }

  # Map words to integer ids (tokenizer$word_index translates back) and
  # pad/truncate every sequence to exactly maxlen
  encode <- function(text) {
    pad_sequences(texts_to_sequences(tokenizer, text), maxlen = maxlen)
  }

  list(train = encode(train_text),
       validation = encode(valid_text),
       test = encode(test_text),
       tokenizer = tokenizer)
}
#-------------------------------------------------------------------------------------------------------------------
# Builds a confusion-matrix summary (TN/FP/FN/TP counts) from binary labels
# and predictions.
# Labels:  vector of true classes (0/1)
# Guesses: vector of predicted classes (0/1)
# Returns a data.frame with columns `group` (cell name) and `value` (count),
# in the fixed order TN, FP, FN, TP (FP_Pie_Chart relies on this order).
#
# Rewritten in base R: the original encoded each (label, guess) pair as a
# two-digit binary string and ran strtoi() through purrr::map2 per element;
# the arithmetic code label*2 + guess gives the same 0..3 index directly and
# vectorises the whole computation.
Accuracy_Label_Table <- function (Labels, Guesses) {
  labels <- as.integer(unlist(Labels))
  guesses <- as.integer(unlist(Guesses))
  # 0 = TN (0,0), 1 = FP (0,1), 2 = FN (1,0), 3 = TP (1,1)
  codes <- labels * 2L + guesses

  group = c("True Negative (TN)", #Label = 0, Guess = 0
             "False Positive (FP)", #Label = 0, Guess = 1
             "False Negative (FN)", #Label = 1, Guess = 0
             "True Positive (TP)" #Label = 1, Guess = 1
             )
  value = c(sum(codes == 0L),
            sum(codes == 1L),
            sum(codes == 2L),
            sum(codes == 3L))

  data.frame(group = group,
             value = value)
}
#-------------------------------------------------------------------------------
# Draws a pie chart of confusion-matrix cells (TP/TN/FP/FN) with accuracy
# figures annotated in the top-right corner.
# Labels/Guesses: binary (0/1) truth and prediction vectors.
# Relies on Accuracy_Label_Table()'s fixed row order:
#   row 1 = TN, row 2 = FP, row 3 = FN, row 4 = TP.
FP_Pie_Chart <- function(Labels, Guesses) {
  a_table <- Accuracy_Label_Table(Labels = Labels,
                     Guesses = Guesses)

  # NOTE(review): N_Acc = TN/(TN+FN) and P_Acc = TP/(TP+FP) are predictive
  # values (not recall/sensitivity) — confirm the annotation labels below
  # describe the intended metric
  N_Acc <- round(a_table[1,2] / (a_table[1,2] + a_table[3,2]), digits = 4)
  P_Acc <- round(a_table[4,2] / (a_table[4,2] + a_table[2,2]), digits = 4)
  Acc <- round((a_table[1,2] + a_table[4,2]) / (a_table[1,2] + a_table[3,2] + a_table[4,2] + a_table[2,2]), digits = 4)
  
  # Stacked bar turned into a pie via polar coordinates, one slice per cell
  plt <- a_table %>%
    ggplot(aes(x = "", y = value, fill = group)) +
    geom_col() + 
    geom_label(aes(label = value),
               position = position_stack(vjust = 0.5),
               show.legend = FALSE) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = c("#FFABAB", "#FFB092",
                                 "#b4d4fa", "#BFFCC6"),
                      guide = guide_legend(reverse = TRUE)) + 
    ggtitle("TP, TN, FP, FN Pie Chart") +
    theme_void()
  
  # ggdraw (cowplot) so text can be placed in absolute canvas coordinates
  plt <- ggdraw(plt)
  
  # Three accuracy lines stacked in the top-right corner of the canvas
  plt <- plt +
    annotation_custom(grob = textGrob(paste0("Accuracy Positive: ",P_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025, ymax = 1- .025) +
    annotation_custom(grob = textGrob(paste0("Accuracy Negative: ",N_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .05, ymax = 1- .025 - .05) +
    annotation_custom(grob = textGrob(paste0("Total Accuracy: ",Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .1, ymax = 1- .025 - .1)
  plt
}
#-------------------------------------------------------------------------------------------
# One-hot (binary bag-of-words) encodes the three text splits.
# train/validation/test: character vectors of tweets
# max_words: vocabulary cap used when a new tokenizer must be fitted
# tokenizer: optional pre-fitted keras tokenizer; the default NA means
#            "fit a new one on the training text"
# Returns a list of the three binary matrices plus the tokenizer used.
# (The `valdiation` element name typo is kept for backward compatibility
# with existing callers.)
one_hot_encode <- function(train,validation,test, max_words, tokenizer = NA) {
  
  training_text <- train %>% as.array()
  validation_text <- validation %>% as.array()
  testing_text <- test %>% as.array()
  
  # Bug fix: the condition was inverted (`!is.na(tokenizer)`), which both
  # overwrote any tokenizer the caller supplied AND skipped fitting when the
  # default NA was passed (crashing texts_to_matrix below). Fit a new
  # tokenizer only when none was provided.
  if (is.atomic(tokenizer) && all(is.na(tokenizer))) {
    tokenizer <- text_tokenizer(num_words = max_words) %>%
    fit_text_tokenizer(training_text)
  }
  
  train_one_hot_matrix <- texts_to_matrix(tokenizer, training_text, mode = "binary")#Translates text to a matrix of 0 or 1 where 0 == word NOT present and 1 == word present
  #word_index <- tokenizer$word_index #The dictionary to translate a sequence to a sentence
  validation_one_hot_matrix <- texts_to_matrix(tokenizer, validation_text, mode = "binary")
  test_one_hot_matrix <- texts_to_matrix(tokenizer, testing_text, mode = "binary")
  
  list(train = train_one_hot_matrix,
       valdiation = validation_one_hot_matrix,
       test = test_one_hot_matrix,
    tokenizer = tokenizer)
}

Datasets

Tweets with Sarcasm Data set with Hashtags
  • Click Code to show model code
# Model hyperparameters
max_words = 1000    # tokenizer vocabulary size
embedding_dim = 8   # word-embedding width
maxlen = 50         # padded sequence length


# Binary labels (regular vs everything else); hashtags left in the text
sets <- retrieve_dataset(binary = TRUE
                         )

train <- sets$train_set
training_labels <- sets$train_labels

validation <- sets$validation_set
validation_labels <- sets$validation_labels

test <- sets$test_set
test_labels <- sets$test_labels



# Tokenize and pad each split to maxlen
sequences <- generate_sequences(train,
                                validation,
                                test,
                                maxlen = maxlen,
                                max_words = max_words)
training_sequences <- sequences$train
validation_sequences <- sequences$validation
test_sequences <- sequences$test

# Embedding -> bidirectional LSTM -> LSTM -> sigmoid binary classifier
model <-  keras_model_sequential() %>%
  layer_embedding(input_dim = max_words,
                  output_dim = embedding_dim,
                  input_length = maxlen) %>%
  bidirectional(layer_lstm(units = 128, return_sequences = TRUE))%>%
  layer_lstm(units = 64, return_sequences = FALSE) %>%
  layer_flatten() %>%
  layer_dense(units = 1, 
              activation = "sigmoid") 

model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = "accuracy"
)

history <- model %>% fit(
  training_sequences,
  training_labels,
  epochs = 5,
  batch_size = 128,
  validation_data= list(validation_sequences,validation_labels)
)


# Held-out loss/accuracy on the test split
results <- model %>% evaluate(test_sequences,test_labels)
results
Predictions and visualizing accuracy
## Rows: 81408 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 8128 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] "found tokenizer!"

Tweets with Sarcasm Data set without Hashtags
  • Click Code to show the model code
# Same architecture as the previous chunk, but trained on the
# hashtag-stripped variant of the dataset
max_words = 1000    # tokenizer vocabulary size
embedding_dim = 8   # word-embedding width
maxlen = 50         # padded sequence length


sets <- retrieve_dataset(binary = TRUE,
                         without_hashtags = TRUE
                         )

train <- sets$train_set
training_labels <- sets$train_labels

validation <- sets$validation_set
validation_labels <- sets$validation_labels

test <- sets$test_set
test_labels <- sets$test_labels



# Tokenize and pad each split to maxlen
sequences <- generate_sequences(train,
                                validation,
                                test,
                                maxlen = maxlen,
                                max_words = max_words)
training_sequences <- sequences$train
validation_sequences <- sequences$validation
test_sequences <- sequences$test

# Embedding -> bidirectional LSTM -> LSTM -> sigmoid binary classifier
model <-  keras_model_sequential() %>%
  layer_embedding(input_dim = max_words,
                  output_dim = embedding_dim,
                  input_length = maxlen) %>%
  bidirectional(layer_lstm(units = 128, return_sequences = TRUE))%>%
  layer_lstm(units = 64, return_sequences = FALSE) %>%
  layer_flatten() %>%
  layer_dense(units = 1, 
              activation = "sigmoid") 

model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = "accuracy"
)

history <- model %>% fit(
  training_sequences,
  training_labels,
  epochs = 5,
  batch_size = 128,
  validation_data= list(validation_sequences,validation_labels)
)


# Held-out loss/accuracy on the test split
results <- model %>% evaluate(test_sequences,test_labels)
results
Predictions and visualizing accuracy
## New names:
## Rows: 81408 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): tweets, class dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 8128 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): tweets, class dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## [1] "found tokenizer!"

ISarcasm
  • Click Code to show model code
# Same architecture again, trained on the ISarcasm dataset (note the longer
# 17-epoch schedule for this smaller dataset)
max_words = 1000    # tokenizer vocabulary size
embedding_dim = 8   # word-embedding width
maxlen = 50         # padded sequence length


sets <- retrieve_dataset_ISarcasm(binary = TRUE
                         )

train <- sets$train_set
training_labels <- sets$train_labels

validation <- sets$validation_set
validation_labels <- sets$validation_labels

test <- sets$test_set
test_labels <- sets$test_labels



# Tokenize and pad each split to maxlen
sequences <- generate_sequences(train,
                                validation,
                                test,
                                maxlen = maxlen,
                                max_words = max_words)
training_sequences <- sequences$train
validation_sequences <- sequences$validation
test_sequences <- sequences$test

# Embedding -> bidirectional LSTM -> LSTM -> sigmoid binary classifier
model <-  keras_model_sequential() %>%
  layer_embedding(input_dim = max_words,
                  output_dim = embedding_dim,
                  input_length = maxlen) %>%
  bidirectional(layer_lstm(units = 128, return_sequences = TRUE))%>%
  layer_lstm(units = 64, return_sequences = FALSE) %>%
  layer_flatten() %>%
  layer_dense(units = 1, 
              activation = "sigmoid") 

model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = "accuracy"
)

history <- model %>% fit(
  training_sequences,
  training_labels,
  epochs = 17,
  batch_size = 128,
  validation_data= list(validation_sequences,validation_labels)
)


# Held-out loss/accuracy on the test split
results <- model %>% evaluate(test_sequences,test_labels)
results
Predictions and visualizing accuracy
## [1] "found tokenizer!"

SVM

Random Forest

Decision Trees

XgBoost